R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Six-number summary (min, quartiles, mean, max) of the built-in `cars`
# dataset: speed and stopping distance for 50 cars.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

library(ISLR)

Basic Commands

# Vector creation with c() (concatenate), then auto-printing by name.
x <- c(1,3,2,5)
x
## [1] 1 3 2 5
# NOTE(review): `=` is used for assignment here while `<-` was used
# above; `<-` is the conventional R assignment operator.
x = c(1,6,2)
x
## [1] 1 6 2
y= c(1,4,3)
# length() gives the number of elements in a vector.
length(x)
## [1] 3
length(y)
## [1] 3
# Arithmetic on vectors is elementwise.
x+y
## [1]  2 10  5
# ls() lists workspace objects; rm() deletes the named ones.
ls()
## [1] "x" "y"
rm(x,y)
ls()
## character(0)
# NOTE(review): rm(list=ls()) wipes the entire workspace — acceptable
# interactively, but an anti-pattern inside scripts/documents.
rm(list=ls())
# ?matrix opens the help page for matrix().
?matrix
# Matrices fill column-by-column by default...
x=matrix(c(1,2,3,4),2,2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
# ...and row-by-row when byrow=TRUE.
matrix(c(1,2,3,4),2,2, byrow=TRUE)
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
# sqrt() and ^ apply elementwise to every entry of the matrix.
sqrt(x)
##          [,1]     [,2]
## [1,] 1.000000 1.732051
## [2,] 1.414214 2.000000
x^2
##      [,1] [,2]
## [1,]    1    9
## [2,]    4   16
# Independent normal samples, so their correlation is near zero.
x=rnorm(50)
y=rnorm(50,mean=50,sd=.1)
cor(x,y)
## [1] 0.008807874
# set.seed() makes random draws reproducible. NOTE(review): the second
# call overrides the first, so only set.seed(3) governs y below.
set.seed(1303)
set.seed(3)
y=rnorm(100)
mean(y)
## [1] 0.01103557
var(y)
## [1] 0.7328675
# sd(y) equals sqrt(var(y)), as the matching outputs confirm.
sqrt(var(y))
## [1] 0.8560768
sd(y)
## [1] 0.8560768

Graphics

# Scatterplot of two independent standard-normal samples.
x=rnorm(100)
y=rnorm(100)
plot(x,y)

# xlab/ylab/main label the axes and title the plot.
plot(x,y,xlab="this is the x-axis", ylab="this is the y-axis",main="Plot of X vs Y")

# Open a PDF graphics device so the next plot is written to Figure.pdf.
# NOTE(review): there is no matching dev.off() call in this chunk, so
# the device is never closed and the PDF file may be left incomplete.
pdf("Figure.pdf")
plot(x,y,col="green")
# seq() builds regular sequences: the integers 1..10, then 50 evenly
# spaced points spanning [-pi, pi].
x=seq(1,10)
x
##  [1]  1  2  3  4  5  6  7  8  9 10
x=seq(-pi,pi,length=50)
x
##  [1] -3.14159265 -3.01336438 -2.88513611 -2.75690784 -2.62867957
##  [6] -2.50045130 -2.37222302 -2.24399475 -2.11576648 -1.98753821
## [11] -1.85930994 -1.73108167 -1.60285339 -1.47462512 -1.34639685
## [16] -1.21816858 -1.08994031 -0.96171204 -0.83348377 -0.70525549
## [21] -0.57702722 -0.44879895 -0.32057068 -0.19234241 -0.06411414
## [26]  0.06411414  0.19234241  0.32057068  0.44879895  0.57702722
## [31]  0.70525549  0.83348377  0.96171204  1.08994031  1.21816858
## [36]  1.34639685  1.47462512  1.60285339  1.73108167  1.85930994
## [41]  1.98753821  2.11576648  2.24399475  2.37222302  2.50045130
## [46]  2.62867957  2.75690784  2.88513611  3.01336438  3.14159265
y=x 
# outer() evaluates the function at every (x, y) grid pair, producing a
# 50x50 matrix of values for contouring.
f=outer(x,y,function(x,y)cos(y)/(1+x^2))
contour(x,y,f)
# add=T overlays more contour levels on the existing plot.
# NOTE(review): T is a reassignable alias; TRUE is the safer spelling.
contour(x,y,f,nlevels=45,add=T)

# Antisymmetric part of f, viewed as contours, a heatmap, and surfaces.
fa=(f-t(f))/2
contour(x,y,fa,nlevels=15)

image(x,y,fa)

persp(x,y,fa)

# theta and phi rotate the viewing angle of the perspective plot.
persp(x,y,fa,theta=30,phi=40)

Indexing Data

# A 4x4 matrix filled column-wise with 1..16.
A=matrix(1:16,4,4)
A
##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16
# A[row, col] extracts a single element.
A[2,3]
## [1] 10
# Index vectors select sub-matrices (rows 1,3 crossed with cols 2,4).
A[c(1,3),c(2,4)]
##      [,1] [,2]
## [1,]    5   13
## [2,]    7   15
# An empty column index selects the entire row.
A[1,]
## [1]  1  5  9 13
# Negative indices DROP the named rows/columns instead of keeping them.
A[-c(1,3),]
##      [,1] [,2] [,3] [,4]
## [1,]    2    6   10   14
## [2,]    4    8   12   16
# dim() reports (rows, columns).
dim(A)
## [1] 4 4

Loading Data

# Read the Auto data from a text file. header=T keeps the column names;
# na.strings="?" converts "?" entries to NA on import.
Auto = read.table("Auto.txt", header=T, na.strings="?")
# fix() opens an interactive spreadsheet editor; left commented out.
#fix(Auto)
dim(Auto)
## [1] 397   9
names(Auto)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"
# na.omit() drops the 5 rows containing missing values (397 -> 392).
Autona =na.omit(Auto)
dim(Autona)
## [1] 392   9
names(Autona)
## [1] "mpg"          "cylinders"    "displacement" "horsepower"  
## [5] "weight"       "acceleration" "year"         "origin"      
## [9] "name"
# Scatterplot using explicit data-frame column references.
plot(Auto$cylinders, Auto$mpg)

# attach() puts Auto's columns on the search path so they can be used
# by bare name below. NOTE(review): attach() is widely discouraged —
# masked names cause subtle bugs; with(Auto, ...) or Auto$col is safer.
attach(Auto)
plot(cylinders,mpg)

plot(cylinders,mpg)

plot(cylinders, mpg, col ="red")

# The warnings below show that cylinders is numeric here, so plot()
# draws a scatterplot and ignores the boxplot-only arguments varwidth
# and horizontal. Converting first with as.factor(cylinders) would make
# plot() dispatch to boxplots and honor these options.
plot(cylinders, mpg, col ="red", varwidth=T)
## Warning in plot.window(...): "varwidth" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "varwidth" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "varwidth" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "varwidth" is
## not a graphical parameter
## Warning in box(...): "varwidth" is not a graphical parameter
## Warning in title(...): "varwidth" is not a graphical parameter

plot(cylinders, mpg, col ="red", varwidth=T, horizontal=T)
## Warning in plot.window(...): "varwidth" is not a graphical parameter
## Warning in plot.window(...): "horizontal" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "varwidth" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "horizontal" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "varwidth" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "varwidth" is
## not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in box(...): "varwidth" is not a graphical parameter
## Warning in box(...): "horizontal" is not a graphical parameter
## Warning in title(...): "varwidth" is not a graphical parameter
## Warning in title(...): "horizontal" is not a graphical parameter

# Histograms of mpg; col=2 is red, breaks controls the bin count.
hist(mpg)

hist(mpg,col=2)

hist(mpg,col=2,breaks=15)

# Scatterplot matrix of every variable pair, then a selected subset via
# a formula interface.
pairs(Auto)

pairs(~mpg + displacement+horsepower+weight+acceleration, Auto)

plot(horsepower,mpg)
# identify() lets the user click plot points to label them by `name`;
# the integer(0) result means no points were selected in this render.
identify(horsepower,mpg,name)

## integer(0)
# Per-column summaries: numeric columns get five-number summaries (the
# NA's: 5 under horsepower matches the rows na.omit() removed above);
# name is a factor, so its most frequent levels are tabulated instead.
summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.0   1st Qu.: 75.0  
##  Median :23.00   Median :4.000   Median :146.0   Median : 93.5  
##  Mean   :23.52   Mean   :5.458   Mean   :193.5   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                  NA's   :5      
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2223   1st Qu.:13.80   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2800   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2970   Mean   :15.56   Mean   :75.99   Mean   :1.574  
##  3rd Qu.:3609   3rd Qu.:17.10   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##              name    
##  ford pinto    :  6  
##  amc matador   :  5  
##  ford maverick :  5  
##  toyota corolla:  5  
##  amc gremlin   :  4  
##  amc hornet    :  4  
##  (Other)       :368

Exercises

Conceptual

https://raw.githubusercontent.com/asadoughi/stat-learning/master/ch2/answers

  1. For each of parts (a) through (d), indicate whether we would generally expect the performance of a flexible statistical learning method to be better or worse than an inflexible method. Justify your answer.
  1. The sample size n is extremely large, and the number of predictors p is small.
  • better - a more flexible approach will fit the data closer and with the large sample size a better fit than an inflexible approach would be obtained
  1. The number of predictors p is extremely large, and the number of observations n is small.
  • worse - a flexible method would overfit the small number of observations
  1. The relationship between the predictors and response is highly non-linear.
  • better - with more degrees of freedom, a flexible model would obtain a better fit
  1. The variance of the error terms, i.e. σ² = Var(ε), is extremely high.
  • worse - flexible methods fit to the noise in the error terms and increase variance
  1. Explain whether each scenario is a classification or regression problem, and indicate whether we are most interested in inference or prediction. Finally, provide n and p.
  1. We collect a set of data on the top 500 firms in the US. For each firm we record profit, number of employees, industry and the CEO salary. We are interested in understanding which factors affect CEO salary.
  • regression. inference. quantitative output of CEO salary based on CEO firm’s features. n - 500 firms in the US p - profit, number of employees, industry
  1. We are considering launching a new product and wish to know whether it will be a success or a failure. We collect data on 20 similar products that were previously launched. For each product we have recorded whether it was a success or failure, price charged for the product, marketing budget, competition price, and ten other variables.
  • classification. prediction. predicting new product’s success or failure. n - 20 similar products previously launched p - price charged, marketing budget, comp. price, ten other variables
  1. We are interested in predicting the % change in the USD/Euro exchange rate in relation to the weekly changes in the world stock markets. Hence we collect weekly data for all of 2012.

  • regression. prediction. quantitative output of % change. n - 52 weeks of 2012 weekly data p - % change in US market, % change in British market, % change in German market

  1. We now revisit the bias-variance decomposition.
  1. Provide a sketch of typical (squared) bias, variance, training error, test error, and Bayes (or irreducible) error curves, on a single plot, as we go from less flexible statistical learning methods towards more flexible approaches. The x-axis should represent the amount of flexibility in the method, and the y-axis should represent the values for each curve. There should be five curves. Make sure to label each one.
  2. Explain why each of the five curves has the shape displayed in part (a).
  1. You will now think of some real-life applications for statistical learning.
  1. Describe three real-life applications in which classification might be useful. Describe the response, as well as the predictors. Is the goal of each application inference or prediction? Explain your answer.
  2. Describe three real-life applications in which regression might be useful. Describe the response, as well as the predictors. Is the goal of each application inference or prediction? Explain your answer.
  3. Describe three real-life applications in which cluster analysis might be useful.
  1. What are the advantages and disadvantages of a very flexible (versus a less flexible) approach for regression or classification? Under what circumstances might a more flexible approach be preferred to a less flexible approach? When might a less flexible approach be preferred?
  • The advantages for a very flexible approach for regression or classification are obtaining a better fit for non-linear models, decreasing bias.

  • The disadvantages for a very flexible approach for regression or classification are requires estimating a greater number of parameters, follow the noise too closely (overfit), increasing variance.

  • A more flexible approach would be preferred to a less flexible approach when we are interested in prediction and not the interpretability of the results.

  • A less flexible approach would be preferred to a more flexible approach when we are interested in inference and the interpretability of the results.

  1. Describe the differences between a parametric and a non-parametric statistical learning approach. What are the advantages of a parametric approach to regression or classification (as opposed to a nonparametric approach)? What are its disadvantages?
  • A parametric approach reduces the problem of estimating f down to one of estimating a set of parameters because it assumes a form for f.

  • A non-parametric approach does not assume a functional form for f and so requires a very large number of observations to accurately estimate f.

  • The advantages of a parametric approach to regression or classification are the simplifying of modeling f to a few parameters and not as many observations are required compared to a non-parametric approach.

  • The disadvantages of a parametric approach to regression or classification are a potential to inaccurately estimate f if the form of f assumed is wrong or to overfit the observations if more flexible models are used.

  1. The table below provides a training data set containing six observations, three predictors, and one qualitative response variable. Obs. X1 X2 X3 Y 1 0 3 0 Red 2 2 0 0 Red 3 0 1 3 Red 4 0 1 2 Green 5 −1 0 1 Green 6 1 1 1 Red Suppose we wish to use this data set to make a prediction for Y when X1 = X2 = X3 = 0 using K-nearest neighbors.
  1. Compute the Euclidean distance between each observation and the test point, X1 = X2 = X3 = 0.
  2. What is our prediction with K = 1? Why?
  3. What is our prediction with K = 3? Why?
  4. If the Bayes decision boundary in this problem is highly nonlinear, then would we expect the best value for K to be large or small? Why?

Applied

# Column names of the College dataset (shipped with the ISLR package).
names(College)
##  [1] "Private"     "Apps"        "Accept"      "Enroll"      "Top10perc"  
##  [6] "Top25perc"   "F.Undergrad" "P.Undergrad" "Outstate"    "Room.Board" 
## [11] "Books"       "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [16] "perc.alumni" "Expend"      "Grad.Rate"
# fix() opens an interactive data editor; commented out so the document
# knits non-interactively.
#fix(College)
#fix(College)
# Per-column summaries; Private is a factor (No/Yes counts), the rest
# are numeric five-number summaries.
summary(College)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
# Keep the first ten columns and draw their scatterplot matrix.
A = College[,1:10]
pairs(A)

We write our model as $Y = f(X) + \varepsilon$, where $\varepsilon$ captures measurement errors and other discrepancies. Is there an ideal $f(X)$? A good value is the regression function $f(x) = E[Y \mid X = x]$, which is the optimal predictor of $Y$ with regard to mean-squared prediction error. In order to estimate $f$, note that we typically have few if any data points with $X = x$ exactly, so we cannot compute $E[Y \mid X = x]$. Therefore we relax the definition and let $\hat{f}(x) = \mathrm{Ave}(Y \mid X \in N(x)) = \dfrac{\sum_i y_i \mathbf{1}\{x_i \in N(x)\}}{\sum_i \mathbf{1}\{x_i \in N(x)\}}$, where $N(x)$ is some neighborhood of $x$. Then the Pythagorean theorem says $E[(Y - \hat{f}(X))^2 \mid X = x] = \underbrace{[f(x) - \hat{f}(x)]^2}_{\text{reducible}} + \underbrace{\mathrm{Var}(\varepsilon \mid X = x)}_{\text{irreducible}}$. Nearest-neighbor averaging can be pretty good for small dimension and a large number of observations — i.e. $p \le 4$ and large-ish $N$. Nearest-neighbor methods can be lousy when $p$ is large. Reason: the curse of dimensionality — nearest neighbors tend to be far away in high dimensions, so we lose the spirit of estimating $E[Y \mid X = x]$ by local averaging. Although it is almost never correct, a linear model $f_L(X)$ often serves as a good and interpretable approximation to the unknown true function $f(X)$. More flexible regression models include the thin-plate spline $\hat{f}_S$. Some trade-offs: • prediction accuracy versus interpretability; • good fit versus over-fit or under-fit (how do we know when the fit is just right?); • parsimony versus black-box. Increasing in flexibility and decreasing in interpretability: Subset Selection, Lasso → Least Squares → Generalized Additive Models, Trees → Bagging, Boosting, Support Vector Machines. Assessing model accuracy: suppose we fit a model $\hat{f}(x)$ to some training data $\mathrm{Tr} = \{x_i, y_i\}_1^N$, and we wish to see how well it performs. We could compute the average squared prediction error over $\mathrm{Tr}$: $\mathrm{MSE}_{\mathrm{Tr}} = \mathrm{Ave}_{i \in \mathrm{Tr}}[y_i - \hat{f}(x_i)]^2$. This may be biased toward more overfit models. Instead we should, if possible, compute it using fresh test data $\mathrm{Te} = \{x_i, y_i\}_1^M$: $\mathrm{MSE}_{\mathrm{Te}} = \mathrm{Ave}_{i \in \mathrm{Te}}[y_i - \hat{f}(x_i)]^2$.
In practice, one can usually compute the training MSE with relative ease, but estimating the test MSE is considerably more difficult because usually no test data are available. There are a variety of approaches that can be used in practice to estimate the point where the minimum test MSE is achieved; one important method is cross-validation, which estimates the test MSE using the training data. Bias–variance trade-off: suppose we have fit a model $\hat{f}(x)$ to some training data $\mathrm{Tr}$, and let $(x_0, y_0)$ be a test observation drawn from the population. If the true model is $Y = f(X) + \varepsilon$ (with $f(x) = E[Y \mid X = x]$), then $E[(y_0 - \hat{f}(x_0))^2] = \mathrm{Var}(\hat{f}(x_0)) + [\mathrm{Bias}(\hat{f}(x_0))]^2 + \mathrm{Var}(\varepsilon)$. Here $E[(y_0 - \hat{f}(x_0))^2]$ denotes the expected test MSE: the average test MSE we would obtain if we repeatedly estimated $f$ using a large number of training sets and tested each at $x_0$. The overall expected test MSE can be computed by averaging $E[(y_0 - \hat{f}(x_0))^2]$ over all possible values of $x_0$ in the test set. Note that $\mathrm{Bias}(\hat{f}(x_0)) = E[\hat{f}(x_0)] - f(x_0)$. Typically, as the flexibility of $\hat{f}$ increases, its variance increases and its bias decreases, so choosing the flexibility based on average test error amounts to a bias–variance trade-off. Classification problems: here the response variable $Y$ is qualitative, taking discrete values in a set $\mathcal{C}$. Our goal is to build a classifier $C(X)$ that assigns a class label from $\mathcal{C}$ to a future unlabeled observation $X$. Is there an ideal $C(X)$? Suppose the $K$ elements of $\mathcal{C}$ are numbered $1, 2, \ldots, K$, and let $p_k(x) = \Pr(Y = k \mid X = x)$ for $k = 1, 2, \ldots, K$. Then the Bayes-optimal classifier at $x$ is $C(x) = j$ where $p_j(x) = \max\{p_1(x), p_2(x), \ldots, p_K(x)\}$. Nearest-neighbor averaging can be used as before, and it also breaks down as the dimension grows; however, the impact on $\hat{C}(x)$ is less than on the $\hat{p}_k(x)$, $k = 1, \ldots, K$. Typically we measure the performance of $\hat{C}(x)$ using the misclassification error rate: $\mathrm{Err}_{\mathrm{Te}} = \mathrm{Ave}_{i \in \mathrm{Te}}\, I\{y_i \ne \hat{C}(x_i)\}$.
The Bayes classifier (using the true $p_k(x)$) has the smallest error rate in the population. Support-vector machines build structured models for $C(x)$; we will also build structured models for representing the $p_k(x)$ — e.g. logistic regression and generalized additive models.